# Notebook setup: render matplotlib inline and silence library warnings.
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
# Data wrangling / NLP / plotting / model-selection stack used throughout.
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
import pickle
from tqdm import tqdm
import os
from prettytable import PrettyTable
import csv
import math
from chart_studio import plotly
import plotly.offline as offline
import plotly.graph_objs as go
# Enable offline plotly rendering inside the notebook.
offline.init_notebook_mode()
from collections import Counter
# Load the raw training data only to recover the original project titles,
# then attach them to the already-preprocessed feature table.
data_old=pd.read_csv('train_data.csv')
data_old.head(2)
data=pd.read_csv("preprocessed_data.csv")
data.head()
# NOTE(review): assumes both CSVs are row-aligned (same order and length);
# .values drops the index, so a mismatch would silently pair titles with
# the wrong rows -- confirm.
data['project_title'] = data_old['project_title'].values
data.head(2)
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# VADER sentiment scorer; polarity_scores returns a dict with keys
# 'neg', 'pos', 'neu' and 'compound'.
sent_int = SentimentIntensityAnalyzer()
# https://analyticsindiamag.com/sentiment-analysis-made-easy-using-vader/#:~:text=The%20compound%20score%20is%20the,%25%20Negative%2C%2050.8%25%20Neutral.
# Accumulators for the four VADER components, one entry per essay
# (filled by update_sentiments in the loop below).
negative = []
positive = []
neutral = []
compound = []
def update_sentiments(values):
    """Append one essay's VADER scores to the module-level accumulators.

    values -- dict returned by SentimentIntensityAnalyzer.polarity_scores,
    carrying the keys 'neg', 'pos', 'neu' and 'compound'.
    """
    for key, bucket in (("neg", negative), ("pos", positive),
                        ("neu", neutral), ("compound", compound)):
        bucket.append(values[key])
from tqdm import tqdm
# Score every essay with VADER; tqdm only renders a progress bar.
for essay in tqdm(data["essay"]):
    update_sentiments(sent_int.polarity_scores(essay))
# Attach the collected scores as four new numeric feature columns.
data["negative"] = negative
data["positive"] = positive
data["neutral"] = neutral
data["compound"] = compound
data.head()
# English stop-word list (NLTK's set, inlined) dropped from the text during
# cleaning in preprocess_text below.
stopwords= ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
"you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
"hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
"mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
'won', "won't", 'wouldn', "wouldn't"]
def decontracted(phrase):
    """Expand common English contractions in *phrase* and return the result.

    The rules are applied in order: the two irregular specific forms first,
    then the generic apostrophe-suffix rules; ordering matters (e.g. "n't"
    must run before the generic "'t" rule).
    """
    rules = (
        # specific
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, replacement in rules:
        phrase = re.sub(pattern, replacement, phrase)
    return phrase
from tqdm import tqdm
import re
def preprocess_text(text_data):
    """Clean an iterable of raw strings for vectorization.

    Steps per string: expand contractions, blank out literal escape
    sequences left by CSV export, strip non-alphanumerics, drop stop
    words, lower-case and trim.  Returns a list of cleaned strings.
    """
    preprocessed_text = []
    # tqdm renders a progress bar while iterating.
    for raw_sentence in tqdm(text_data):
        cleaned = decontracted(raw_sentence)
        # Literal "\r", "\n", '\"' sequences become spaces.
        for escaped in ('\\r', '\\n', '\\"'):
            cleaned = cleaned.replace(escaped, ' ')
        cleaned = re.sub('[^A-Za-z0-9]+', ' ', cleaned)
        # https://gist.github.com/sebleier/554280
        kept = [token for token in cleaned.split() if token.lower() not in stopwords]
        preprocessed_text.append(' '.join(kept).lower().strip())
    return preprocessed_text
# Clean the raw titles, then drop the original column in favour of the
# cleaned version.
data['clean_project_title']=preprocess_text(data['project_title'].values)
data.drop(['project_title'],axis='columns',inplace=True)
data.head()
# Work on the first 50k rows to keep vectorization / training tractable.
X=data[:50000]
y=X['project_is_approved'].values
#data.drop(['project_is_approved'],axis='columns',inplace=True)
from sklearn.model_selection import train_test_split
# Stratified split keeps the approved/rejected ratio equal in both splits.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, stratify=y,random_state=42)
print(len(X_train))
print(len(X_test))
# Shared TF-IDF vectorizer: unigrams through 4-grams, terms must appear in
# at least 10 documents, vocabulary capped at 5000 features.
vect_tfidf_1 = TfidfVectorizer(min_df=10, ngram_range=(1,4), max_features=5000)
def TFIDF_Vectorization(train_column, test_column):
    """Fit the shared vectorizer on the train column, then transform both
    splits with the train-fitted vocabulary.  Returns (train, test) sparse
    matrices."""
    vect_tfidf_1.fit(train_column)
    return tuple(vect_tfidf_1.transform(col) for col in (train_column, test_column))
# TF-IDF features for the essays and the cleaned titles.
train1 = X_train['essay'].values
test1 = X_test['essay'].values
# BUG FIX: the original wrapped the returned 2-tuple in tqdm(...); tqdm is a
# progress bar over iterables and does nothing useful here -- all the work
# has already happened inside TFIDF_Vectorization.  Unpack directly.
X_train_essay, X_test_essay = TFIDF_Vectorization(train1, test1)
print("After vectorizations")
print(X_train_essay.shape, y_train.shape)
print(X_test_essay.shape, y_test.shape)
train2 = X_train['clean_project_title'].values
test2 = X_test['clean_project_title'].values
X_train_tit, X_test_tit = TFIDF_Vectorization(train2, test2)
print("After vectorizations")
print(X_train_tit.shape, y_train.shape)
print(X_test_tit.shape, y_test.shape)
###http://www.jessicayung.com/how-to-use-pickle-to-save-and-load-variables-in-python/
import pickle
# SECURITY NOTE(review): pickle.load executes arbitrary code from the file;
# only load 'glove_vectors' from a trusted source.
with open (r'glove_vectors', "rb") as f:
    model = pickle.load(f)  # assumed dict: word -> 300-d GloVe vector (see np.zeros(300) below)
glove_words = set(model.keys())
tfidf_model = TfidfVectorizer()
tfidf_model.fit(X_train['essay'])
# Hoist the vocabulary lookup: the original called get_feature_names()
# three times (it is also deprecated/removed in newer scikit-learn --
# use get_feature_names_out() there).
tfidf_feat = tfidf_model.get_feature_names()
# word -> idf value learned on the train essays
dictionary = dict(zip(tfidf_feat, list(tfidf_model.idf_)))
tfidf_words = set(tfidf_feat)
## https://medium.com/analytics-vidhya/featurization-of-text-data-bow-tf-idf-avgw2v-tfidf-weighted-w2v-7a6c62e8b097#:~:text=TFIDF%20weighted%20Word2Vec,sum%20by%20sum%20tfidf%20value.
from scipy import sparse
# TF-IDF-weighted average of GloVe word vectors for each train essay.
train_tfidf_w2v_vectors = []
for sentence in tqdm(X_train['essay']):
    vector = np.zeros(300)
    tf_idf_weight = 0
    # Hoisted: the original re-split the sentence for every word, making
    # the inner loop quadratic in sentence length.
    words = sentence.split()
    word_freq = Counter(words)
    n_words = len(words)
    for word in words:
        if (word in glove_words) and (word in tfidf_words):
            # BUG FIX: str.count(word) counted substring hits as well
            # (e.g. 'art' inside 'start'); Counter counts whole tokens.
            tf_idf = dictionary[word] * (word_freq[word] / n_words)
            vector += model[word] * tf_idf
            tf_idf_weight += tf_idf
    if tf_idf_weight != 0:
        vector /= tf_idf_weight
    train_tfidf_w2v_vectors.append(vector)
### https://machinelearningmastery.com/sparse-matrices-for-machine-learning/
# Stored sparse so it can be hstacked with the other sparse blocks later.
train_tfidf_w2v_vectors = sparse.csr_matrix(train_tfidf_w2v_vectors)
print((train_tfidf_w2v_vectors.shape))
# TF-IDF-weighted average of GloVe vectors for each test essay, using the
# idf dictionary fitted on the train essays above.
test_tfidf_w2v_vectors = []
for sentence in tqdm(X_test['essay']):
    vector = np.zeros(300)
    tf_idf_weight = 0
    words = sentence.split()   # hoisted: was re-split for every word
    word_freq = Counter(words)
    n_words = len(words)
    for word in words:
        if (word in glove_words) and (word in tfidf_words):
            # BUG FIX: str.count(word) also matched substrings; Counter
            # counts whole tokens only.
            tf_idf = dictionary[word] * (word_freq[word] / n_words)
            vector += model[word] * tf_idf
            tf_idf_weight += tf_idf
    if tf_idf_weight != 0:
        vector /= tf_idf_weight
    test_tfidf_w2v_vectors.append(vector)
## https://machinelearningmastery.com/sparse-matrices-for-machine-learning/
test_tfidf_w2v_vectors = sparse.csr_matrix(test_tfidf_w2v_vectors)
print((test_tfidf_w2v_vectors.shape))
# Second TF-IDF model, fitted on the cleaned train titles.
tfidf_model1 = TfidfVectorizer()
tfidf_model1.fit(X_train['clean_project_title'])
# Hoist the vocabulary lookup: the original called get_feature_names()
# three times (deprecated/removed in newer scikit-learn; use
# get_feature_names_out() there).
tfidf_feat = tfidf_model1.get_feature_names()
# word -> idf value learned on the train titles
dictionary = dict(zip(tfidf_feat, list(tfidf_model1.idf_)))
tfidf_words = set(tfidf_feat)
# TF-IDF-weighted average of GloVe vectors for each train project title.
train_tfidf_w2v_vec = []
for sentence in tqdm(X_train['clean_project_title']):
    vector = np.zeros(300)
    tf_idf_weight = 0
    words = sentence.split()   # hoisted: was re-split for every word
    word_freq = Counter(words)
    n_words = len(words)
    for word in words:
        if (word in glove_words) and (word in tfidf_words):
            # BUG FIX: str.count(word) also matched substrings; Counter
            # counts whole tokens only.
            tf_idf = dictionary[word] * (word_freq[word] / n_words)
            vector += model[word] * tf_idf
            tf_idf_weight += tf_idf
    if tf_idf_weight != 0:
        vector /= tf_idf_weight
    train_tfidf_w2v_vec.append(vector)
## https://machinelearningmastery.com/sparse-matrices-for-machine-learning/
train_tfidf_w2v_vec = sparse.csr_matrix(train_tfidf_w2v_vec)
print(train_tfidf_w2v_vec.shape)
test_tfidf_w2v_vec = []  # TF-IDF-weighted GloVe vector per test title
for sentence in tqdm(X_test['clean_project_title']):
    vector = np.zeros((300))
    tf_idf_weight = 0
    words = sentence.split()   # hoisted: was re-split for every word
    word_freq = Counter(words)
    n_words = len(words)
    for word in words:
        if (word in glove_words) and (word in tfidf_words):
            # BUG FIX: str.count(word) also matched substrings; Counter
            # counts whole tokens only.
            tf_idf = dictionary[word] * (word_freq[word] / n_words)
            vector += model[word] * tf_idf  # calculating tfidf weighted w2v
            tf_idf_weight += tf_idf
    if tf_idf_weight != 0:
        vector /= tf_idf_weight
    test_tfidf_w2v_vec.append(vector)
## https://machinelearningmastery.com/sparse-matrices-for-machine-learning/
test_tfidf_w2v_vec = sparse.csr_matrix(test_tfidf_w2v_vec)
print(test_tfidf_w2v_vec.shape)
## https://stackoverflow.com/questions/66122577/response-coding-for-categorical-data
def response_coding(xtrain, ytrain, feature):
    """Build a response-coding table for *feature*.

    Maps every distinct value of the column to
    [P(y==1 | value), P(y==0 | value)] estimated from (xtrain, ytrain).
    """
    encoding = dict()
    column = xtrain.loc[:, feature]
    for label in tqdm(xtrain[feature].unique()):
        is_label = xtrain[feature] == label
        total_count = column[is_label].count()
        negatives = column[is_label & (ytrain == 0)].count()
        positives = column[is_label & (ytrain == 1)].count()
        encoding[label] = [positives / total_count, negatives / total_count]
    return encoding
# For train set
def transform(feature, df ):
    """Response-code *feature* of *df* with probabilities fit on X_train.

    Returns a list of [P(approved|value), P(rejected|value)] pairs; values
    missing from df's own value counts fall back to the uninformative
    [0.5, 0.5] prior.
    """
    diction = response_coding(X_train,y_train,feature)
    seen_values = dict(df[feature].value_counts())
    return [diction[value] if value in seen_values else [0.5, 0.5]
            for value in df[feature]]
# For test set
def transform1(feature, df ):
    """Response-code *feature* of the test frame *df*.

    BUG FIX: the original fit the response probabilities on X_test/y_test,
    i.e. it used the test labels to build the test features (target
    leakage).  As its own inline comment intended ("transform test data
    with trainning probabilities"), the encoding must come from the TRAIN
    split; test values never seen in training get the [0.5, 0.5] prior.
    """
    diction = response_coding(X_train, y_train, feature)
    count_val = df[feature].value_counts()
    f_list = []
    for c in df[feature]:
        # Guard on diction too: train-fit coding may miss test-only values.
        if c in dict(count_val).keys() and c in diction:
            f_list.append(diction[c])
        else:
            f_list.append([0.5, 0.5])
    return f_list
# Response-code each categorical column into [P(approved|value),
# P(rejected|value)] pairs and report the resulting shapes.
X_train_state_transform = transform('school_state',X_train)
X_test_state_transform = transform1('school_state',X_test)
X_train_state_transform = np.array(X_train_state_transform)
X_test_state_transform = np.array(X_test_state_transform)
print("After vectorizations")
print(X_train_state_transform.shape, y_train.shape)
print(X_test_state_transform.shape, y_test.shape)
X_train_prefix_transform = transform('teacher_prefix',X_train)
X_test_prefix_transform = transform1('teacher_prefix',X_test)
X_train_prefix_transform = np.array(X_train_prefix_transform)
X_test_prefix_transform = np.array(X_test_prefix_transform)
print("After vectorizations")
print(X_train_prefix_transform.shape, y_train.shape)
print(X_test_prefix_transform.shape, y_test.shape)
X_train_grade_transform = transform('project_grade_category',X_train)
X_test_grade_transform = transform1('project_grade_category',X_test)
X_train_grade_transform = np.array(X_train_grade_transform)
X_test_grade_transform = np.array(X_test_grade_transform)
print("After vectorizations")
print(X_train_grade_transform.shape, y_train.shape)
print(X_test_grade_transform.shape, y_test.shape)
X_train_categories_transform = transform('clean_categories',X_train)
X_test_categories_transform = transform1('clean_categories',X_test)
X_train_categories_transform = np.array(X_train_categories_transform)
X_test_categories_transform = np.array(X_test_categories_transform)
print("After vectorizations")
print(X_train_categories_transform.shape, y_train.shape)
print(X_test_categories_transform.shape, y_test.shape)
X_train_sub_transform = transform('clean_subcategories',X_train)
X_test_sub_transform = transform1('clean_subcategories',X_test)
X_train_sub_transform = np.array(X_train_sub_transform)
X_test_sub_transform = np.array(X_test_sub_transform)
print("After vectorizations")
print(X_train_sub_transform.shape, y_train.shape)
print(X_test_sub_transform.shape, y_test.shape)
def numerical_data(train_value, test_value):
    """L2-normalize a numeric column using train-set statistics only.

    BUG FIX: the original used sklearn's Normalizer, which scales each ROW
    (sample) to unit norm.  Every call site passes a column reshaped to
    (-1, 1), so each one-element row was divided by its own magnitude and
    every non-zero entry collapsed to 1.0, destroying the feature.  Here
    both splits are divided by the column's L2 norm computed on the TRAIN
    split (so no test statistics leak into the scaling).

    Returns (train_normalized, test_normalized) with unchanged shapes.
    """
    norm = np.linalg.norm(train_value, axis=0)
    # Guard against an all-zero train column (avoid division by zero).
    norm = np.where(norm == 0, 1.0, norm)
    return train_value / norm, test_value / norm
# Normalize each numeric column (price, prior project count, and the four
# VADER sentiment scores); the scaler is fit on the train split only.
train7=X_train['price'].values.reshape(-1,1)
test7=X_test['price'].values.reshape(-1,1)
X_train_price_transform,X_test_price_transform = numerical_data(train7,test7)
print("After vectorizations")
print(X_train_price_transform.shape, y_train.shape)
print(X_test_price_transform.shape, y_test.shape)
train8=X_train['teacher_number_of_previously_posted_projects'].values.reshape(-1,1)
test8=X_test['teacher_number_of_previously_posted_projects'].values.reshape(-1,1)
X_train_prev_transform,X_test_prev_transform = numerical_data(train8,test8)
print("After vectorizations")
print(X_train_prev_transform.shape, y_train.shape)
print(X_test_prev_transform.shape, y_test.shape)
train9=X_train['negative'].values.reshape(-1,1)
test9=X_test['negative'].values.reshape(-1,1)
X_train_neg,X_test_neg = numerical_data(train9,test9)
print("After vectorizations")
print(X_train_neg.shape, y_train.shape)
print(X_test_neg.shape, y_test.shape)
train10=X_train['positive'].values.reshape(-1,1)
test10=X_test['positive'].values.reshape(-1,1)
X_train_pos,X_test_pos = numerical_data(train10,test10)
print("After vectorizations")
print(X_train_pos.shape, y_train.shape)
print(X_test_pos.shape, y_test.shape)
train11=X_train['neutral'].values.reshape(-1,1)
test11=X_test['neutral'].values.reshape(-1,1)
X_train_neu,X_test_neu = numerical_data(train11,test11)
print("After vectorizations")
print(X_train_neu.shape, y_train.shape)
print(X_test_neu.shape, y_test.shape)
train12=X_train['compound'].values.reshape(-1,1)
test12=X_test['compound'].values.reshape(-1,1)
X_train_compound,X_test_compound = numerical_data(train12,test12)
print("After vectorizations")
print(X_train_compound.shape, y_train.shape)
print(X_test_compound.shape, y_test.shape)
from scipy.sparse import hstack
# Feature set 1: response-coded categoricals + normalized numerics +
# TF-IDF text features + VADER sentiment scores, stacked column-wise.
X_train_set1 = hstack((X_train_state_transform,
                       X_train_prefix_transform,
                       X_train_grade_transform,
                       X_train_categories_transform,
                       X_train_sub_transform,
                       X_train_price_transform,
                       X_train_prev_transform,
                       X_train_essay,
                       X_train_compound,
                       X_train_neu,
                       X_train_pos,
                       X_train_neg,
                       X_train_tit)).tocsr()
# BUG FIX: the original stacked the test sentiment columns in a different
# order (compound, pos, neg, neu) from the train matrix (compound, neu,
# pos, neg), silently misaligning feature columns between train and test.
# The test stack now mirrors the train stack exactly.
X_test_set1 = hstack((X_test_state_transform,
                      X_test_prefix_transform,
                      X_test_grade_transform,
                      X_test_categories_transform,
                      X_test_sub_transform,
                      X_test_price_transform,
                      X_test_prev_transform,
                      X_test_essay,
                      X_test_compound,
                      X_test_neu,
                      X_test_pos,
                      X_test_neg,
                      X_test_tit)).tocsr()
from scipy.sparse import hstack
# Feature set 2: the same response-coded categoricals and normalized
# numerics, but the text is represented by TF-IDF-weighted GloVe vectors
# (titles then essays); train and test stacks use the same column order.
X_train_set2=hstack((X_train_state_transform, \
                     X_train_prefix_transform, \
                     X_train_grade_transform, \
                     X_train_categories_transform, \
                     X_train_sub_transform, \
                     X_train_price_transform, \
                     X_train_prev_transform, \
                     train_tfidf_w2v_vec, \
                     train_tfidf_w2v_vectors)).tocsr()
X_test_set2=hstack((X_test_state_transform, \
                    X_test_prefix_transform, \
                    X_test_grade_transform, \
                    X_test_categories_transform, \
                    X_test_sub_transform, \
                    X_test_price_transform, \
                    X_test_prev_transform, \
                    test_tfidf_w2v_vec, \
                    test_tfidf_w2v_vectors)).tocsr()
!pip install lightgbm
from lightgbm import LGBMClassifier
# 4x4 grid over tree depth and boosting rounds, scored by 5-fold
# cross-validated ROC-AUC on feature set 1.
parameters = {"max_depth":[1,2,5,10],"n_estimators":[5,10,100,500] }
clf = GridSearchCV(LGBMClassifier(), parameters, cv=5, scoring='roc_auc',return_train_score=True,n_jobs=-1)
clf.fit(X_train_set1,y_train)
trainauc= clf.cv_results_['mean_train_score']
trainaucstd= clf.cv_results_['std_train_score']
cvauc = clf.cv_results_['mean_test_score']
cvaucstd= clf.cv_results_['std_test_score']
print('Best score: ',clf.best_score_)
print('Best Hyper parameters: ',clf.best_params_)
## https://www.geeksforgeeks.org/pandas-groupby-unstack/
## https://indianaiproduction.com/seaborn-heatmap/
## https://stackoverflow.com/questions/34162443/why-do-many-examples-use-fig-ax-plt-subplots-in-matplotlib-pyplot-python
# NOTE(review): these two lists are not referenced below; the heatmap axes
# come from the groupby/unstack index instead.
param_max_depth= [1, 2, 5, 10]
param_n_estimators = [5, 10, 100, 500]
# Pivot the CV results into an (n_estimators x max_depth) grid of mean AUCs.
scores1 = pd.DataFrame(clf.cv_results_).groupby(['param_n_estimators', 'param_max_depth']).max().unstack()[['mean_test_score', 'mean_train_score']]
fig,ax = plt.subplots(1,2, figsize=(20,5))
sns.heatmap(scores1.mean_train_score, annot = True, fmt='.4g', ax=ax[0],cmap='coolwarm')
sns.heatmap(scores1.mean_test_score, annot = True, fmt='.4g', ax=ax[1],cmap='coolwarm')
ax[0].set_title('Train Set')
ax[1].set_title('Test Set')
plt.show()
# Refit with the chosen hyper-parameters (max_depth=2, n_estimators=500);
# class_weight="balanced" compensates for the class imbalance in y.
lgb1 = LGBMClassifier(class_weight ="balanced",max_depth=2,n_estimators=500)
lgb1.fit(X_train_set1,y_train)
# Probability of the positive class for each row.
pred_ytrain1 = lgb1.predict_proba(X_train_set1) [:,1]
pred_ytest1 = lgb1.predict_proba(X_test_set1) [:,1]
trfpr1, trtpr1, trthres1 = roc_curve(y_train, pred_ytrain1)
tfpr1, ttpr1, tthres1 = roc_curve(y_test, pred_ytest1)
plt.plot(trfpr1, trtpr1, label="train AUC ="+str(auc(trfpr1,trtpr1)))
plt.plot(tfpr1, ttpr1, label="test AUC ="+str(auc(tfpr1,ttpr1)))
plt.legend()
plt.grid()
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ERROR PLOTS")
plt.show()
def best_threshold(threshold, fpr, tpr):
    """Return the ROC threshold that maximises tpr*(1-fpr).

    threshold/fpr/tpr are the parallel arrays returned by roc_curve.
    """
    score = tpr * (1 - fpr)
    chosen = threshold[np.argmax(score)]
    print("the maximum value", max(score), "for threshold", np.round(chosen,3))
    return chosen
def predict(prob, threshold):
    """Binarise predicted probabilities: 1 where prob >= threshold, else 0."""
    return [1 if p >= threshold else 0 for p in prob]
# Operating threshold chosen to maximise tpr*(1-fpr) on the TRAIN ROC.
best_thres = best_threshold(trthres1, trfpr1, trtpr1)
conf_mat1=metrics.confusion_matrix(y_train,predict(pred_ytrain1,best_thres))
print("CONFUSION MATRIX OF TRAIN DATA")
print(conf_mat1)
sns.heatmap(conf_mat1, annot=True, fmt='d',cmap='GnBu')
# NOTE(review): the test threshold is tuned on the TEST ROC curve itself,
# which leaks test information into the evaluation; reusing best_thres
# from the train curve is the standard practice -- confirm intent.
best_thres1 = best_threshold(tthres1, tfpr1, ttpr1)
conf_mat2=metrics.confusion_matrix(y_test,predict(pred_ytest1,best_thres1))
print("CONFUSION MATRIX OF Test DATA")
print(conf_mat2)
sns.heatmap(conf_mat2, annot=True, fmt='d',cmap='GnBu')
from lightgbm import LGBMClassifier
# Same 4x4 grid search, now on feature set 2 (TF-IDF-weighted GloVe text).
parameters = {"max_depth":[1,2,5,10],"n_estimators":[5,10,100,500] }
clf1 = GridSearchCV(LGBMClassifier(), parameters, cv=5, scoring='roc_auc',return_train_score=True,n_jobs=-1)
clf1.fit(X_train_set2,y_train)
trainauc= clf1.cv_results_['mean_train_score']
trainaucstd= clf1.cv_results_['std_train_score']
cvauc = clf1.cv_results_['mean_test_score']
cvaucstd= clf1.cv_results_['std_test_score']
print('Best score: ',clf1.best_score_)
print('Best Hyper parameters: ',clf1.best_params_)
## https://www.geeksforgeeks.org/pandas-groupby-unstack/
## https://indianaiproduction.com/seaborn-heatmap/
## https://stackoverflow.com/questions/34162443/why-do-many-examples-use-fig-ax-plt-subplots-in-matplotlib-pyplot-python
# NOTE(review): these two lists are not referenced below; the heatmap axes
# come from the groupby/unstack index instead.
param_max_depth= [1, 2, 5, 10]
param_n_estimators = [5, 10, 100, 500]
# Pivot the CV results into an (n_estimators x max_depth) grid of mean AUCs.
scores2 = pd.DataFrame(clf1.cv_results_).groupby(['param_n_estimators', 'param_max_depth']).max().unstack()[['mean_test_score', 'mean_train_score']]
fig,ax = plt.subplots(1,2, figsize=(20,5))
sns.heatmap(scores2.mean_train_score, annot = True, fmt='.4g', ax=ax[0],cmap='coolwarm')
sns.heatmap(scores2.mean_test_score, annot = True, fmt='.4g', ax=ax[1],cmap='coolwarm')
ax[0].set_title('Train Set')
ax[1].set_title('Test Set')
plt.show()
# Refit the feature-set-2 model with the chosen hyper-parameters.
lgb2 = LGBMClassifier(class_weight ="balanced",max_depth=2,n_estimators=500)
lgb2.fit(X_train_set2,y_train)
# Probability of the positive class for each row.
pred_ytrain2 = lgb2.predict_proba(X_train_set2) [:,1]
pred_ytest2 = lgb2.predict_proba(X_test_set2) [:,1]
trfpr2, trtpr2, trthres2 = roc_curve(y_train, pred_ytrain2)
tfpr2, ttpr2, tthres2 = roc_curve(y_test, pred_ytest2)
plt.plot(trfpr2, trtpr2, label="train AUC ="+str(auc(trfpr2,trtpr2)))
plt.plot(tfpr2, ttpr2, label="test AUC ="+str(auc(tfpr2,ttpr2)))
plt.legend()
plt.grid()
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ERROR PLOTS")
plt.show()
# Confusion matrices for the feature-set-2 model at the tpr*(1-fpr)-optimal
# thresholds.
best_thres2 = best_threshold(trthres2, trfpr2, trtpr2)
conf_mat3 = metrics.confusion_matrix(y_train, predict(pred_ytrain2, best_thres2))
print("CONFUSION MATRIX OF TRAIN DATA")
print(conf_mat3)
sns.heatmap(conf_mat3, annot=True, fmt='d', cmap='GnBu')
best_thres3 = best_threshold(tthres2, tfpr2, ttpr2)
conf_mat4 = metrics.confusion_matrix(y_test, predict(pred_ytest2, best_thres3))
# BUG FIX: this block evaluates the TEST split, but the original printed
# "CONFUSION MATRIX OF TRAIN DATA" here, mislabelling the output; the label
# now matches the set-1 block's wording.
print("CONFUSION MATRIX OF Test DATA")
print(conf_mat4)
sns.heatmap(conf_mat4, annot=True, fmt='d', cmap='GnBu')
# http://zetcode.com/python/prettytable/
# Final comparison of the two feature sets.
from prettytable import PrettyTable
x = PrettyTable()
# BUG FIX: the models were tuned over (max_depth, n_estimators); the
# original header said "min samples split", which is not one of the tuned
# LightGBM parameters here.
x.field_names = ["Vectorizer and Encoding", "Model", "Hyperparameters(max depth, n estimators)", "Test AUC"]
x.add_row(["TFIDF + Response Coding ", "LightGBM", "(2, 500)", 0.703])
x.add_row(["TFIDF W2V + Response Coding", "LightGBM", "(2, 500)", 0.680])
print(x)